Load data
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.1.1
## ✔ dials 1.2.0 ✔ tibble 3.2.1
## ✔ dplyr 1.1.2 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.1
## ✔ modeldata 1.1.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.0 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
Set parameter ranges
# Vectors of all possible conditions combinations
frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
guans <- c(1,2)
lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights)
Loop through combinations
for (i in 1:dim(conditions)[1])
{
velocity <- CC.TotalData$v[
(CC.TotalData$Flow.rate==conditions[i,1] &
CC.TotalData$Chlorophyll==conditions[i,2] &
as.numeric(CC.TotalData$Guano)==conditions[i,3] &
as.numeric(CC.TotalData$Light)==conditions[i,4])]
if (length(velocity) <= 1) {
conditions[i,5] <- NA } else {
c<-cor.test(log10(velocity[1:length(velocity)-1]),
log10(velocity[2:length(velocity)]))
conditions[i,5] <- c$estimate
v0 <- log10(velocity[1:length(velocity)-1])
v1 <- log10(velocity[2:length(velocity)])
df <- data.frame(v0,v1)
fit <- lm(v1 ~ v0 + 1, data = df)
conditions[i,6] <- fit$coefficients[1]
conditions[i,7] <- fit$coefficients[2]
conditions[i,8] <- sd(fit$residuals)
d <- dip.test(velocity)
conditions[i,9] <- d$p.value
hist(log10(velocity),100,
main=d$p.value)
}
}
## n = 258078 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 116154 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 266817 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 211577 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 253471 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 192727 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 271850 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 91500 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 137251 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 282203 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 140189 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 142013 > max_n{n in table} = 72000 -- using that as asymptotic value.
conditions <- setNames(conditions,c("flow.rate","chlorophyll","guano","light","corr.val","slope","intercept","sigma","dip.test"))
Using random forest to fit the missing values for autocorrelation
Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data
conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]
conditions.rf.slope <- randomForest(slope ~ .,
data=select(conditions_data,c(1,2,3,4,6)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.slope)
varImpPlot(conditions.rf.slope)
conditions_blank$slope <- predict(conditions.rf.slope,conditions_blank[,1:4])
fit.slope <- predict(conditions.rf.slope,conditions_data[,1:4])
plot(fit.slope,conditions_data$slope)
plot(conditions_blank$chlorophyll,conditions_blank$slope)
plot(conditions_blank$flow.rate,conditions_blank$slope)
plot(conditions_blank$light,conditions_blank$slope)
plot(conditions_blank$guano,conditions_blank$slope)
conditions.rf.intercept <- randomForest(intercept ~ .,
data=select(conditions_data,c(1,2,3,4,7)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.intercept)
varImpPlot(conditions.rf.intercept)
conditions_blank$intercept <- predict(conditions.rf.intercept,conditions_blank[,1:4])
fit.intercept <- predict(conditions.rf.intercept,conditions_data[,1:4])
plot(fit.intercept,conditions_data$intercept)
plot(conditions_blank$chlorophyll,conditions_blank$intercept)
plot(conditions_blank$flow.rate,conditions_blank$intercept)
plot(conditions_blank$light,conditions_blank$intercept)
plot(conditions_blank$guano,conditions_blank$intercept)
conditions.rf.sigma <- randomForest(sigma ~ .,
data=select(conditions_data,c(1,2,3,4,8)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.sigma)
varImpPlot(conditions.rf.sigma)
conditions_blank$sigma <- predict(conditions.rf.sigma,conditions_blank[,1:4])
fit.sigma <- predict(conditions.rf.sigma,conditions_data[,1:4])
plot(fit.sigma,conditions_data$sigma)
plot(conditions_blank$chlorophyll,conditions_blank$sigma)
plot(conditions_blank$flow.rate,conditions_blank$sigma)
plot(conditions_blank$light,conditions_blank$sigma)
plot(conditions_blank$guano,conditions_blank$sigma)
Save fit models
save(conditions.rf.slope,conditions.rf.intercept,conditions.rf.sigma, file='notebook13-rf.RData')